# Importing necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
import os
# NOTE(review): hard-coded absolute path makes this notebook machine-specific;
# consider a relative path or a config value
os.chdir (r"D:\anaconda\ashok\Project\UnsupervisedLearning")
import warnings
# Suppresses ALL warnings for cleaner output — this also hides deprecation notices
warnings.filterwarnings('ignore')
# Loading Dataset - Reading the data as a data frame
df = pd.read_csv('vehicle.csv')
print (df)
# Finding Shape of data
# Inference : There are 846 rows and 19 columns of data
df.shape
# Finding Type of each column
df.dtypes
# Viewing first 10 rows of data
df.head(10)
# Label Encoding the class variable (string labels -> integer codes; LabelEncoder
# assigns codes in sorted label order)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
columns = df.columns
print(columns)
df['class'] = le.fit_transform(df['class'])
df.shape
# Viewing first 10 rows of data after label encoding the class variable
df.head(10)
# Checking presence of missing values: True counts per column are NaN counts
# NOTE(review): pd.value_counts (top-level) is deprecated in newer pandas;
# df.isna().sum() gives the same information
df.isna().apply(pd.value_counts)
df.info()
# Finding and treating missing values by imputing the column median.
# Median is preferred over mean here because several attributes contain outliers.
from sklearn.impute import SimpleImputer
newdf = df.copy()
X = newdf.iloc[:, 0:19]  # separating all 19 numerical attributes (includes the encoded 'class')
# The 'verbose' parameter was deprecated in scikit-learn 1.1 and removed in 1.4,
# so it is no longer passed.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(X)
column = X.columns
print(column)
# fit_transform returns a numpy array, so rebuild the DataFrame with the original column names
newdf = pd.DataFrame(transformed_values, columns = column )
newdf.describe().T
# Checking the results after imputing missing values
print("Original null value count:\n", df.isnull().sum())
print("\n\nCount after imputing null values:\n", newdf.isnull().sum())
Univariate analysis is performed to understand each attribute in the dataset; then bivariate analysis is performed to find the relationships between attributes.
The steps below are followed:
# Descriptive Statistics Summary of dataframe after imputing null values
newdf.describe().T
# Finding shape of dataframe after imputing null values
newdf.shape
# Plotting histograms to check whether each column is (approximately) normal
# NOTE(review): the 'seaborn-whitegrid' style name was deprecated in matplotlib 3.6
# (renamed 'seaborn-v0_8-whitegrid') and removed in 3.8 — confirm the matplotlib version
plt.style.use('seaborn-whitegrid')
newdf.hist(figsize=(15,15), color='blue', edgecolor = 'black')
plt.show()
# Measuring skewness per column (NaNs skipped; > 0 means right-skewed)
newdf.skew(axis = 0, skipna = True)
# Summary view of all attributes in a single horizontal boxplot
ax = sns.boxplot(data=newdf, orient="h")
# One box plot per attribute to check for outliers.
# The original cell repeated the subplot/boxplot pair 18 times; iterating over
# the column list draws the identical figure without the duplication.
_box_columns = [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
    'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
    'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]
plt.figure(figsize= (5,30))
for _pos, _col in enumerate(_box_columns, start=1):
    plt.subplot(18, 1, _pos)
    sns.boxplot(x= newdf[_col], color='blue')
plt.show()
radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance, scaled_variance.1, scaled_radius_of_gyration.1, skewness_about and skewness_about.1 are the attributes with outliers, visible as the individual points plotted beyond the whiskers in the box plots above.
# Treating outliers with IQRs
# NOTE(review): the `iqr` import is unused — quartiles are computed via DataFrame.quantile
from scipy.stats import iqr
# First (Q1) and third (Q3) quartiles per column
Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# Since number of outliers are less, all the attributes which had outliers have been treated and removed.
# Keep only rows where every value lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
cleandf = newdf[~((newdf < (Q1 - 1.5 * IQR)) |(newdf > (Q3 + 1.5 * IQR))).any(axis=1)]
# Plotting the Box Plot to check whether outliers are removed
# NOTE(review): `cleandf` is rebound later (just before PCA) to newdf.drop('class', ...),
# so this outlier-filtered frame is only used for the plots that follow
ax = sns.boxplot(data=cleandf, orient="h")
# Box plots per attribute after removing outliers, to confirm the IQR filtering worked.
# The original cell repeated the subplot/boxplot pair 18 times; the loop draws
# the identical figure without the duplication.
_clean_box_columns = [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
    'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
    'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]
plt.figure(figsize= (5,30))
for _pos, _col in enumerate(_clean_box_columns, start=1):
    plt.subplot(18, 1, _pos)
    sns.boxplot(x= cleandf[_col], color='blue')
plt.show()
# Dropping the 'class' column before PCA, since PCA should only be performed
# on the independent attributes.
# (This explanation was originally a bare triple-quoted string; a bare string
# expression is a real no-op statement, so plain comments are clearer.)
# NOTE(review): this rebinds `cleandf` from `newdf` (which still contains the
# outliers) rather than from the outlier-filtered frame built above — the IQR
# outlier treatment is effectively discarded from this point on.
cleandf= newdf.drop('class', axis=1)
cleandf.columns
'''
Using Pearson Correlation Coefficient to see what all attributes are linearly related and also visualize the same
in the seaborns scatter plot
'''
# Heatmap of the pairwise Pearson correlation matrix, annotated with the coefficients
fig, axs = plt.subplots(figsize=(10,10))
sns.heatmap(cleandf.corr(), annot=True, linewidths=.8, ax=axs)
elongatedness and pr.axis_rectangularity appear to have a strong negative correlation (value ≈ -0.95);
max.length_aspect_ratio and radius_ratio have a moderate correlation.
# Pair Plot: pairwise scatter plots of the attributes (first column skipped)
sns.pairplot(cleandf.iloc[:,1:])
scaled_variance and scaled_variance.1, as well as elongatedness and pr.axis_rectangularity, are strongly correlated. Hence they need to be dropped or treated carefully before model building.
Our objective is to identify the vehicle type from the input features, so our main assumption is that there is little or no multicollinearity between the features.
From the correlation matrix above, we can see that there are features with more than 0.9 correlation, so we can decide to drop columns whose correlation magnitude is 0.9 or above. There are 8 such columns.
Another observation is that more than 50% of the attributes are highly correlated. The easiest way to deal with multicollinearity is to delete or eliminate one of each pair of strongly correlated features. (Ex: scaled_variance and scaled_variance.1 have a strong positive correlation, so one can be kept and the other dropped; similarly, elongatedness and pr.axis_rectangularity have a very strong negative correlation, so we can keep just one.) This approach can be used to select the features we want to carry forward for model analysis.
However, there is a better approach called PCA - Principal Component Analysis - which can be used for dimensionality reduction.
# Plotting the count of the target variable per class.
# The series is passed as the keyword `x=`: positional data arguments to
# countplot were deprecated in seaborn 0.12 and later raise an error.
sns.countplot(x=newdf['class'])
plt.show()
# Separate features and target from the imputed dataframe
array = newdf.values
X = array[:,0:18] # select all rows and first 18 columns which are the attributes
Y = array[:,18] # select all rows and the 19th column which is the class variable
test_size = 0.30 # taking 70:30 training and test set
# Scaling of independent variable data using standard scaler (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_std, Y, test_size=test_size, random_state=1) # To set the random state
type(X_train)
# Baseline support-vector classifier on all 18 standardized features
svc = SVC() #instantiate the object
#fit the model on original raw data
svc.fit(X_train,y_train)
#predict the y value
Orig_y_predict = svc.predict(X_test)
print("Model Score On Original Data - training dataset ",svc.score(X_train, y_train))
print("Model Score On Original Data - test dataset ",svc.score(X_test, y_test))
print("Accuracy Score On Original Data",accuracy_score(y_test, Orig_y_predict))
print(metrics.confusion_matrix(y_test, Orig_y_predict))
Accuracy on test data set using support vector machine on original data is 0.952755905511811
# 10-fold cross-validation of the RBF-kernel SVC on the standardized original features.
from sklearn.model_selection import KFold
scores = []
best_svc = SVC(kernel='rbf')
# random_state must not be passed when shuffle=False: scikit-learn >= 0.24 raises
# ValueError ("Setting a random_state has no effect since shuffle is False").
# With shuffle=False the fold boundaries are deterministic anyway.
cv = KFold(n_splits=10, shuffle=False)
for train_index, test_index in cv.split(X_std):
    # (Loop-body indentation restored; the notebook export had flattened it.)
    X_train, X_test, y_train, y_test = (X_std[train_index], X_std[test_index],
                                        Y[train_index], Y[test_index])
    best_svc.fit(X_train, y_train)
    scores.append(best_svc.score(X_test, y_test))
# Per-fold accuracy scores
print(scores)
The cross-validation scores on the test folds of the original (raw) dataset after performing K-fold cross-validation with K=10 are shown above.
The minimum accuracy score is 0.9411764705882353 and the maximum accuracy score is 0.9882352941176471.
- Split our data into train and test sets (already performed in section 3 above)
- Normalize the training set using the standard scaler (already performed in section 3 above)
- Create the covariance matrix
- Calculate the eigenvectors and their eigenvalues
- Sort the eigenvectors by their eigenvalues in descending order
- Choose the first K eigenvectors (K = required dimension)
- Build a new dataset with reduced dimensionality
# Creating the covariance matrix of the standardized features (18 x 18)
cov_matrix = np.cov(X_std.T)
print("cov_matrix shape:",cov_matrix.shape)
print("Covariance_matrix",cov_matrix)
# Calculating Eigen Vectors & Eigen Values: Using numpy linear algebra function
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
# Use %-formatting: the original passed '...%s' and the value as two separate
# print arguments, which printed the literal placeholder instead of substituting.
print('Eigen Vectors \n%s' % eigenvectors)
print('\n Eigen Values \n%s' % eigenvalues)
# Sorting eigenvalues in descending order
# Make a list of (eigenvalue, eigenvector) pairs:
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]
# Sort by the eigenvalue only. The original eig_pairs.sort() compared whole
# tuples: whenever two eigenvalues tied, Python fell through to comparing the
# numpy eigenvector arrays and raised "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# print out eigenvalues
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
# Total variance = sum of all eigenvalues (features are standardized)
tot = sum(eigenvalues)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)] # an array of variance explained by each
# eigen vector... there will be 18 entries as there are 18 eigen vectors)
cum_var_exp = np.cumsum(var_explained) # an array of cumulative variance. There will be 18 entries with 18 th entry
# cumulative reaching almost 100%
# Plotting The Explained Variance against the Principal Components (scree plot)
plt.bar(range(1,19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),cum_var_exp, where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
- 8 dimensions are used to explain 95% of variation in original data
# Building new data set with reduced dimensionality
P_reduce = np.array(eigvectors_sorted[0:8]) # keep the top 8 eigenvectors (18 -> 8 dimensions)
X_std_8D = np.dot(X_std,P_reduce.T) # projecting original data into principal component dimensions
reduced_pca = pd.DataFrame(X_std_8D) # converting array to dataframe for pairplot
reduced_pca
# Pairplot Of Reduced Dimensions After PCA (KDE curves on the diagonal)
sns.pairplot(reduced_pca, diag_kind='kde')
After dimensionality reduction using PCA, the attributes have become independent, with no correlation among themselves: most of the pairwise plots show a cloud of data points with no linear relationship.
# Building Support Vector Classifier Model with the 8 new reduced variables constructed using PCA
# Splitting the dataset into training and test set in the ratio of 70:30
pca_X_train,pca_X_test,pca_y_train,pca_y_test = train_test_split(reduced_pca,Y,test_size=0.30,random_state=1)
#Fitting the model on PCA data with new dimensions
svc1 = SVC()
svc1.fit(pca_X_train,pca_y_train)
#predicting the y value
pca_y_predict = svc1.predict(pca_X_test)
print("Model Score On Reduced PCA Dimension ",svc1.score(pca_X_test, pca_y_test))
print("Accuracy score after PCA(On 8 dimensions)",accuracy_score(pca_y_test,pca_y_predict))
#Printing Confusion matrix
print(metrics.confusion_matrix(pca_y_test,pca_y_predict))
# Performing 10-fold cross-validation of the RBF-kernel SVC on the principal components.
from sklearn.model_selection import KFold
pca_scores = []
pca_svc = SVC(kernel='rbf')
# random_state must not be passed when shuffle=False: scikit-learn >= 0.24 raises
# ValueError. With shuffle=False the fold boundaries are deterministic anyway.
pca_cv = KFold(n_splits=10, shuffle=False)
for pcatrain_index, pcatest_index in pca_cv.split(X_std_8D):
    # (Loop-body indentation restored; the notebook export had flattened it.)
    X_train, X_test, y_train, y_test = (X_std_8D[pcatrain_index], X_std_8D[pcatest_index],
                                        Y[pcatrain_index], Y[pcatest_index])
    pca_svc.fit(X_train, y_train)
    pca_scores.append(pca_svc.score(X_test, y_test))
# Per-fold accuracy scores on the 8-dimensional PCA data
print(pca_scores)
Using support vector classifier:
Considering that original dataframe had 18 dimensions, after PCA, dimensions got reduced to 8, SVM model has fared well in terms of accuracy score
[[ 57 1 1] [ 1 129 3] [ 6 2 54]]
SVM model on original data set has correctly classified :
- 57 van out of 59 actuals vans and has errored only in 2 cases
- 129 cars out of 133 actual cars
- 54 buses out of 62 actual buses
Wrong classification is minimal
[[ 57 1 1] [ 2 126 5] [ 5 2 55]]
The SVM model on the PCA-reduced data set (8 principal components) has correctly classified:
- 57 van out of 59 actuals vans
- 126 cars out of 133 actual cars
- 55 buses out of 62 actual buses
By using only 8 out of 18 dimensions, wrong classification is minimal and results are comparable with the one we got for original data